import os
import numpy as np
import calendar
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pooch # download data / avoid re-downloading
from IPython import get_ipython
sns.set_palette("colorblind")
pd.options.display.max_rows = 8

Pandas
This lecture is extracted and adapted from the work by Joris Van den Bossche https://github.com/jorisvandenbossche/pandas-tutorial/blob/master/01-pandas_introduction.ipynb
For R users, you might also want to read https://pandas.pydata.org/docs/getting_started/comparison/comparison_with_r.html for a smooth start.
Dataset 1: Titanic dataset
First, it is important to download automatically remote files for reproducibility (and avoid typing names manually)
# Download the Titanic dataset once (pooch caches it and skips re-downloads).
url = "http://josephsalmon.eu/enseignement/datasets/titanic.csv"
path_target = "./titanic.csv"
# Split the target into directory + file name, as pooch expects them separately.
path, fname = os.path.split(path_target)
pooch.retrieve(url, path=path, fname=fname, known_hash=None)  # if needed `pip install pooch`
Reading the file as a pandas dataframe:
df_titanic_raw = pd.read_csv("titanic.csv")

Visualize the end of the dataset:
df_titanic_raw.tail(n=3)| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.45 | NaN | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.00 | C148 | C |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.75 | NaN | Q |
Visualize the beginning of the dataset:
df_titanic_raw.head(n=5)| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
Missing values
It is common to encounter features/covariates with missing values. In pandas they are mostly handled as np.nan (not a number). In the future, they will be treated as pd.NA (not available), in a similar way as in R; see https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html for the standard behavior and details.
Note that the main difference between pd.NA and np.nan is that pd.NA propagates even for comparisons:
pd.NA == 1
<NA>

whereas

np.nan == 1
False

Testing the presence of missing values

pd.isna(pd.NA)
pd.isna(np.nan)
True
The simplest strategy (when you can / when you have enough samples) consists in removing all nans/NAs.
df_titanic = df_titanic_raw.dropna()
df_titanic.tail(3)| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 879 | 880 | 1 | 1 | Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) | female | 56.0 | 0 | 1 | 11767 | 83.1583 | C50 | C |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
# Useful info on the dataset (especially missing values!)
df_titanic.info()<class 'pandas.core.frame.DataFrame'>
Index: 183 entries, 1 to 889
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 183 non-null int64
1 Survived 183 non-null int64
2 Pclass 183 non-null int64
3 Name 183 non-null object
4 Sex 183 non-null object
5 Age 183 non-null float64
6 SibSp 183 non-null int64
7 Parch 183 non-null int64
8 Ticket 183 non-null object
9 Fare 183 non-null float64
10 Cabin 183 non-null object
11 Embarked 183 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 18.6+ KB
# Check that cabin is mostly missing, also the age
df_titanic_raw.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
Description of the titanic.csv dataset
More details can be found here: https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic3info.txt
Survived: Survival 0 = No, 1 = YesPclass: Ticket class 1 = 1st, 2 = 2nd, 3 = 3rdSex: Sex male/femaleAge: Age in yearsSibsp: # of siblings / spouses aboard the TitanicParch: # of parents / children aboard the TitanicTicket: Ticket numberFare: Passenger fareCabin: Cabin numberEmbarked: Port of Embarkation C = Cherbourg, Q = Queenstown, S = SouthamptonName: Name of the passengerPassengerId: Number to identify passenger
Note: an extended version of the dataset is available here for those interested https://biostat.app.vumc.org/wiki/pub/Main/DataSets/titanic.txt.
Simple descriptive statistics
df_titanic.describe()| PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|---|
| count | 183.000000 | 183.000000 | 183.000000 | 183.000000 | 183.000000 | 183.000000 | 183.000000 |
| mean | 455.366120 | 0.672131 | 1.191257 | 35.674426 | 0.464481 | 0.475410 | 78.682469 |
| std | 247.052476 | 0.470725 | 0.515187 | 15.643866 | 0.644159 | 0.754617 | 76.347843 |
| min | 2.000000 | 0.000000 | 1.000000 | 0.920000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 263.500000 | 0.000000 | 1.000000 | 24.000000 | 0.000000 | 0.000000 | 29.700000 |
| 50% | 457.000000 | 1.000000 | 1.000000 | 36.000000 | 0.000000 | 0.000000 | 57.000000 |
| 75% | 676.000000 | 1.000000 | 1.000000 | 47.500000 | 1.000000 | 1.000000 | 90.000000 |
| max | 890.000000 | 1.000000 | 3.000000 | 80.000000 | 3.000000 | 4.000000 | 512.329200 |
Visualization
- Histograms (please avoid…often useless)
# Histogram of passenger ages, normalized so bar areas sum to 1.
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.hist(df_titanic['Age'], density=True, bins=25)
plt.xlabel('Age')
plt.ylabel('Proportion')
plt.title("Passenger age histogram")

- Kernel Density Estimate (KDE):
# Kernel density estimate of passenger ages.
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
sns.kdeplot(
    df_titanic["Age"], ax=ax, fill=True, cut=0, bw_adjust=0.1
)
# Age is on the x-axis, the estimated density on the y-axis
# (the original labels were inverted).
plt.xlabel("Age")
plt.ylabel("Proportion")
plt.title("Passenger age kernel density estimate")
plt.tight_layout()
Note: the bandwidth (here controlled via bw_adjust) sets the smoothing level.
- Swarmplot:
# Swarm plot: each point is a passenger, split by sex and colored by survival.
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
sns.swarmplot(
    data=df_titanic_raw,
    ax=ax,
    x="Sex",
    y="Age",
    hue="Survived",
    palette={0: "r", 1: "k"},  # red = died (0), black = survived (1)
    order=["female", "male"],
)
plt.title("Passenger age by gender/survival")
plt.legend(labels=["Died", "Survived"], loc="upper left")
plt.tight_layout()
6.8% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.

EXERCISE: density over histogram
Plot the density estimate over the histogram
Widgets
Interactive interaction with codes and output is nowadays easier and easier (see also Shiny app in R-software). In python, one can use widgets and the interact package for this purpose. We are going to visualize that on the simple KDE and histogram examples.
Note: the interact and fixed functions used below come from the ipywidgets package (from ipywidgets import interact, fixed).
def hist_explore(
    dataset=df_titanic,
    variable=df_titanic.columns,
    n_bins=24,
    alpha=0.25,
    density=False,
):
    """Interactive histogram of ``dataset[variable]`` (for ipywidgets' interact).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to plot (default: the cleaned Titanic dataframe).
    variable : str
        Column to display; the column list default lets interact build a dropdown.
    n_bins : int
        Number of histogram bins.
    alpha : float
        Bar transparency, in [0, 1].
    density : bool
        If True, normalize the histogram (bar areas sum to 1).
    """
    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    ax.hist(dataset[variable], density=density, bins=n_bins, alpha=alpha)
    plt.ylabel("Density level")
    # `attrs` is empty by default: fall back to a generic name instead of a KeyError.
    plt.title(f"Dataset {dataset.attrs.get('name', 'titanic')}:\n Histogram for passengers' age")
    plt.tight_layout()
    plt.show()
interact(
hist_explore,
dataset=fixed(df_titanic),
n_bins=(1, 50, 1),
alpha=(0, 1, 0.1),
density=False,
)def kde_explore(dataset=df_titanic, variable=df_titanic.columns, bw=5):
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
sns.kdeplot(dataset[variable], bw_adjust=bw, shade=True, cut=0, ax=ax)
plt.ylabel("Density level")
plt.title(f"Dataset {dataset.attrs['name']}:\n KDE for passengers' {variable}")
plt.tight_layout()
plt.show()
interact(kde_explore, dataset=fixed(df_titanic), bw=(0.001, 2, 0.01))

Groupby function
How does the survival rate change w.r.t. to sex?
df_titanic_raw.groupby('Sex')[['Survived']].aggregate(lambda x: x.mean())| Survived | |
|---|---|
| Sex | |
| female | 0.742038 |
| male | 0.188908 |
How does the survival rate change w.r.t. the class?
df_titanic.columnsIndex(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
# Bar plot of the survival rate per ticket class.
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
# .mean() directly: no need to wrap it in aggregate(lambda x: x.mean()).
df_titanic.groupby('Pclass')['Survived'].mean().plot(ax=ax, kind='bar')
plt.xlabel('Classe')
plt.ylabel('Taux de survie')
plt.title('Taux de survie par classe')
plt.show()
EXERCISE: median by class
Perform a similar analysis with the median for the price per class in pounds.
catplot: a visual groupby
ax=sns.catplot(
x="Pclass",
y="Age",
hue="Sex",
palette={'female': 'red', 'male': 'b'},
data=df_titanic_raw,
jitter = '0.2',
s=8,
)
sns.move_legend(ax, "upper left", bbox_to_anchor=(0.8, 0.8))
ax=sns.catplot(
x="Pclass",
y="Age",
hue="Sex",
palette={'female': 'red', 'male': 'b'},
alpha=0.8,
data=df_titanic_raw,
kind='swarm',
s=11,
)
sns.move_legend(ax, "upper left", bbox_to_anchor=(0.8, 0.8))
ax=sns.catplot(
x="Sex",
y="Age",
hue="Sex",
palette={'female': 'red', 'male': 'b'},
col='Pclass',
alpha=0.8,
data=df_titanic_raw,
kind='swarm',
s=6,
height=5,
aspect=0.35
)
ax=sns.catplot(x='Pclass',
y='Age',
hue="Sex",
palette={'female': 'red', 'male': 'b'},
data=df_titanic_raw,
kind="violin",
alpha=0.8,
)
sns.move_legend(ax, "upper left", bbox_to_anchor=(0.8, 0.8))
Beware: large difference in sex ratio by class
df_titanic_raw.groupby(['Sex', 'Pclass'])[['Sex']].count()
df_titanic_raw.groupby(['Sex'])[['Sex']].count()| Sex | |
|---|---|
| Sex | |
| female | 314 |
| male | 577 |
More on groupby pandas-kungfu: cf. also pd.crosstab, etc. https://pbpython.com/groupby-agg.html
pd.crosstab
pd.crosstab(
df_titanic_raw["Sex"],
df_titanic_raw["Pclass"],
values=df_titanic_raw["Sex"],
aggfunc="count",
normalize=False,
)| Pclass | 1 | 2 | 3 |
|---|---|---|---|
| Sex | |||
| female | 94 | 76 | 144 |
| male | 122 | 108 | 347 |
df_titanic| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 6 | 7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.0 | 0 | 0 | 17463 | 51.8625 | E46 | S |
| 10 | 11 | 1 | 3 | Sandstrom, Miss. Marguerite Rut | female | 4.0 | 1 | 1 | PP 9549 | 16.7000 | G6 | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 872 | 873 | 0 | 1 | Carlsson, Mr. Frans Olof | male | 33.0 | 0 | 0 | 695 | 5.0000 | B51 B53 B55 | S |
| 879 | 880 | 1 | 1 | Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) | female | 56.0 | 0 | 1 | 11767 | 83.1583 | C50 | C |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
df_titanic.indexIndex([ 1, 3, 6, 10, 11, 21, 23, 27, 52, 54,
...
835, 853, 857, 862, 867, 871, 872, 879, 887, 889],
dtype='int64', length=183)
df_titanic.columnsIndex(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
pd.options.display.max_rows = 12
df_titanic.dtypes
df_titanic['Name'].astype(str)1 Cumings, Mrs. John Bradley (Florence Briggs Th...
3 Futrelle, Mrs. Jacques Heath (Lily May Peel)
6 McCarthy, Mr. Timothy J
10 Sandstrom, Miss. Marguerite Rut
11 Bonnell, Miss. Elizabeth
...
871 Beckwith, Mrs. Richard Leonard (Sallie Monypeny)
872 Carlsson, Mr. Frans Olof
879 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)
887 Graham, Miss. Margaret Edith
889 Behr, Mr. Karl Howell
Name: Name, Length: 183, dtype: object
Extract numpy arrays from dataframes
useful for using packages on top of pandas (e.g., sklearn, though nowadays it works out of the box with pandas)
array_titanic = df_titanic.values # associated numpy array
array_titanicarray([[2, 1, 1, ..., 71.2833, 'C85', 'C'],
[4, 1, 1, ..., 53.1, 'C123', 'S'],
[7, 0, 1, ..., 51.8625, 'E46', 'S'],
...,
[880, 1, 1, ..., 83.1583, 'C50', 'C'],
[888, 1, 1, ..., 30.0, 'B42', 'S'],
[890, 1, 1, ..., 30.0, 'C148', 'C']], dtype=object)
EXERCISE: dropna
Perform the following operation: remove the columns Cabin from the raw dataset, and then remove the rows with the variable Age missing.
Hint: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html
1D dataset: Series (a column of a DataFrame)
A Series is a labeled 1D column of a kind.
fare = df_titanic['Fare']
fare1 71.2833
3 53.1000
6 51.8625
10 16.7000
11 26.5500
...
871 52.5542
872 5.0000
879 83.1583
887 30.0000
889 30.0000
Name: Fare, Length: 183, dtype: float64
Attributes Series: indices and values
fare.values[:10]array([ 71.2833, 53.1 , 51.8625, 16.7 , 26.55 , 13. ,
35.5 , 263. , 76.7292, 61.9792])
Contrarily to numpy arrays, you can index with other formats than integers:
# Be careful, what follows changes the indexing
df_titanic_raw = df_titanic_raw.set_index('Name')
df_titanic_raw| PassengerId | Survived | Pclass | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Name | |||||||||||
| Braund, Mr. Owen Harris | 1 | 0 | 3 | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| Cumings, Mrs. John Bradley (Florence Briggs Thayer) | 2 | 1 | 1 | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| Heikkinen, Miss. Laina | 3 | 1 | 3 | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| Futrelle, Mrs. Jacques Heath (Lily May Peel) | 4 | 1 | 1 | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| Allen, Mr. William Henry | 5 | 0 | 3 | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Montvila, Rev. Juozas | 887 | 0 | 2 | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
| Graham, Miss. Margaret Edith | 888 | 1 | 1 | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
| Johnston, Miss. Catherine Helen "Carrie" | 889 | 0 | 3 | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
| Behr, Mr. Karl Howell | 890 | 1 | 1 | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
| Dooley, Mr. Patrick | 891 | 0 | 3 | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
age = df_titanic_raw['Age']
age['Behr, Mr. Karl Howell']26.0
age.mean()29.69911764705882
df_titanic_raw[age < 2]| PassengerId | Survived | Pclass | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Name | |||||||||||
| Caldwell, Master. Alden Gates | 79 | 1 | 2 | male | 0.83 | 0 | 2 | 248738 | 29.0000 | NaN | S |
| Panula, Master. Eino Viljami | 165 | 0 | 3 | male | 1.00 | 4 | 1 | 3101295 | 39.6875 | NaN | S |
| Johnson, Miss. Eleanor Ileen | 173 | 1 | 3 | female | 1.00 | 1 | 1 | 347742 | 11.1333 | NaN | S |
| Becker, Master. Richard F | 184 | 1 | 2 | male | 1.00 | 2 | 1 | 230136 | 39.0000 | F4 | S |
| Allison, Master. Hudson Trevor | 306 | 1 | 1 | male | 0.92 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Hamalainen, Master. Viljo | 756 | 1 | 2 | male | 0.67 | 1 | 1 | 250649 | 14.5000 | NaN | S |
| Dean, Master. Bertram Vere | 789 | 1 | 3 | male | 1.00 | 1 | 2 | C.A. 2315 | 20.5750 | NaN | S |
| Thomas, Master. Assad Alexander | 804 | 1 | 3 | male | 0.42 | 0 | 1 | 2625 | 8.5167 | NaN | C |
| Mallet, Master. Andre | 828 | 1 | 2 | male | 1.00 | 0 | 2 | S.C./PARIS 2079 | 37.0042 | NaN | C |
| Richards, Master. George Sibley | 832 | 1 | 2 | male | 0.83 | 1 | 1 | 29106 | 18.7500 | NaN | S |
# You can come back to the original indexing
df_titanic_raw = df_titanic_raw.reset_index()

Counting values for categorical variables
df_titanic_raw['Embarked'].value_counts(normalize=False, sort=True,
ascending=False)Embarked
S 644
C 168
Q 77
Name: count, dtype: int64
pd.options.display.max_rows = 70
df_titanic[df_titanic['Embarked'] == 'C']| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 52 | 53 | 1 | 1 | Harper, Mrs. Henry Sleeper (Myna Haxtun) | female | 49.0 | 1 | 0 | PC 17572 | 76.7292 | D33 | C |
| 54 | 55 | 0 | 1 | Ostby, Mr. Engelhart Cornelius | male | 65.0 | 0 | 1 | 113509 | 61.9792 | B30 | C |
| 96 | 97 | 0 | 1 | Goldschmidt, Mr. George B | male | 71.0 | 0 | 0 | PC 17754 | 34.6542 | A5 | C |
| 97 | 98 | 1 | 1 | Greenfield, Mr. William Bertram | male | 23.0 | 0 | 1 | PC 17759 | 63.3583 | D10 D12 | C |
| 118 | 119 | 0 | 1 | Baxter, Mr. Quigg Edmond | male | 24.0 | 0 | 1 | PC 17558 | 247.5208 | B58 B60 | C |
| 139 | 140 | 0 | 1 | Giglio, Mr. Victor | male | 24.0 | 0 | 0 | PC 17593 | 79.2000 | B86 | C |
| 174 | 175 | 0 | 1 | Smith, Mr. James Clinch | male | 56.0 | 0 | 0 | 17764 | 30.6958 | A7 | C |
| 177 | 178 | 0 | 1 | Isham, Miss. Ann Elizabeth | female | 50.0 | 0 | 0 | PC 17595 | 28.7125 | C49 | C |
| 194 | 195 | 1 | 1 | Brown, Mrs. James Joseph (Margaret Tobin) | female | 44.0 | 0 | 0 | PC 17610 | 27.7208 | B4 | C |
| 195 | 196 | 1 | 1 | Lurette, Miss. Elise | female | 58.0 | 0 | 0 | PC 17569 | 146.5208 | B80 | C |
| 209 | 210 | 1 | 1 | Blank, Mr. Henry | male | 40.0 | 0 | 0 | 112277 | 31.0000 | A31 | C |
| 215 | 216 | 1 | 1 | Newell, Miss. Madeleine | female | 31.0 | 1 | 0 | 35273 | 113.2750 | D36 | C |
| 218 | 219 | 1 | 1 | Bazzani, Miss. Albina | female | 32.0 | 0 | 0 | 11813 | 76.2917 | D15 | C |
| 273 | 274 | 0 | 1 | Natsch, Mr. Charles H | male | 37.0 | 0 | 1 | PC 17596 | 29.7000 | C118 | C |
| 291 | 292 | 1 | 1 | Bishop, Mrs. Dickinson H (Helen Walton) | female | 19.0 | 1 | 0 | 11967 | 91.0792 | B49 | C |
| 292 | 293 | 0 | 2 | Levy, Mr. Rene Jacques | male | 36.0 | 0 | 0 | SC/Paris 2163 | 12.8750 | D | C |
| 299 | 300 | 1 | 1 | Baxter, Mrs. James (Helene DeLaudeniere Chaput) | female | 50.0 | 0 | 1 | PC 17558 | 247.5208 | B58 B60 | C |
| 307 | 308 | 1 | 1 | Penasco y Castellana, Mrs. Victor de Satode (M... | female | 17.0 | 1 | 0 | PC 17758 | 108.9000 | C65 | C |
| 309 | 310 | 1 | 1 | Francatelli, Miss. Laura Mabel | female | 30.0 | 0 | 0 | PC 17485 | 56.9292 | E36 | C |
| 310 | 311 | 1 | 1 | Hays, Miss. Margaret Bechstein | female | 24.0 | 0 | 0 | 11767 | 83.1583 | C54 | C |
| 311 | 312 | 1 | 1 | Ryerson, Miss. Emily Borie | female | 18.0 | 2 | 2 | PC 17608 | 262.3750 | B57 B59 B63 B66 | C |
| 319 | 320 | 1 | 1 | Spedden, Mrs. Frederic Oakley (Margaretta Corn... | female | 40.0 | 1 | 1 | 16966 | 134.5000 | E34 | C |
| 325 | 326 | 1 | 1 | Young, Miss. Marie Grice | female | 36.0 | 0 | 0 | PC 17760 | 135.6333 | C32 | C |
| 329 | 330 | 1 | 1 | Hippach, Miss. Jean Gertrude | female | 16.0 | 0 | 1 | 111361 | 57.9792 | B18 | C |
| 337 | 338 | 1 | 1 | Burns, Miss. Elizabeth Margaret | female | 41.0 | 0 | 0 | 16966 | 134.5000 | E40 | C |
| 366 | 367 | 1 | 1 | Warren, Mrs. Frank Manley (Anna Sophia Atkinson) | female | 60.0 | 1 | 0 | 110813 | 75.2500 | D37 | C |
| 369 | 370 | 1 | 1 | Aubart, Mme. Leontine Pauline | female | 24.0 | 0 | 0 | PC 17477 | 69.3000 | B35 | C |
| 370 | 371 | 1 | 1 | Harder, Mr. George Achilles | male | 25.0 | 1 | 0 | 11765 | 55.4417 | E50 | C |
| 377 | 378 | 0 | 1 | Widener, Mr. Harry Elkins | male | 27.0 | 0 | 2 | 113503 | 211.5000 | C82 | C |
| 393 | 394 | 1 | 1 | Newell, Miss. Marjorie | female | 23.0 | 1 | 0 | 35273 | 113.2750 | D36 | C |
| 452 | 453 | 0 | 1 | Foreman, Mr. Benjamin Laventall | male | 30.0 | 0 | 0 | 113051 | 27.7500 | C111 | C |
| 453 | 454 | 1 | 1 | Goldenberg, Mr. Samuel L | male | 49.0 | 1 | 0 | 17453 | 89.1042 | C92 | C |
| 473 | 474 | 1 | 2 | Jerwan, Mrs. Amin S (Marie Marthe Thuillard) | female | 23.0 | 0 | 0 | SC/AH Basle 541 | 13.7917 | D | C |
| 484 | 485 | 1 | 1 | Bishop, Mr. Dickinson H | male | 25.0 | 1 | 0 | 11967 | 91.0792 | B49 | C |
| 487 | 488 | 0 | 1 | Kent, Mr. Edward Austin | male | 58.0 | 0 | 0 | 11771 | 29.7000 | B37 | C |
| 496 | 497 | 1 | 1 | Eustis, Miss. Elizabeth Mussey | female | 54.0 | 1 | 0 | 36947 | 78.2667 | D20 | C |
| 505 | 506 | 0 | 1 | Penasco y Castellana, Mr. Victor de Satode | male | 18.0 | 1 | 0 | PC 17758 | 108.9000 | C65 | C |
| 523 | 524 | 1 | 1 | Hippach, Mrs. Louis Albert (Ida Sophia Fischer) | female | 44.0 | 0 | 1 | 111361 | 57.9792 | B18 | C |
| 539 | 540 | 1 | 1 | Frolicher, Miss. Hedwig Margaritha | female | 22.0 | 0 | 2 | 13568 | 49.5000 | B39 | C |
| 544 | 545 | 0 | 1 | Douglas, Mr. Walter Donald | male | 50.0 | 1 | 0 | PC 17761 | 106.4250 | C86 | C |
| 550 | 551 | 1 | 1 | Thayer, Mr. John Borland Jr | male | 17.0 | 0 | 2 | 17421 | 110.8833 | C70 | C |
| 556 | 557 | 1 | 1 | Duff Gordon, Lady. (Lucille Christiana Sutherl... | female | 48.0 | 1 | 0 | 11755 | 39.6000 | A16 | C |
| 581 | 582 | 1 | 1 | Thayer, Mrs. John Borland (Marian Longstreth M... | female | 39.0 | 1 | 1 | 17421 | 110.8833 | C68 | C |
| 583 | 584 | 0 | 1 | Ross, Mr. John Hugo | male | 36.0 | 0 | 0 | 13049 | 40.1250 | A10 | C |
| 587 | 588 | 1 | 1 | Frolicher-Stehli, Mr. Maxmillian | male | 60.0 | 1 | 1 | 13567 | 79.2000 | B41 | C |
| 591 | 592 | 1 | 1 | Stephenson, Mrs. Walter Bertram (Martha Eustis) | female | 52.0 | 1 | 0 | 36947 | 78.2667 | D20 | C |
| 599 | 600 | 1 | 1 | Duff Gordon, Sir. Cosmo Edmund ("Mr Morgan") | male | 49.0 | 1 | 0 | PC 17485 | 56.9292 | A20 | C |
| 632 | 633 | 1 | 1 | Stahelin-Maeglin, Dr. Max | male | 32.0 | 0 | 0 | 13214 | 30.5000 | B50 | C |
| 641 | 642 | 1 | 1 | Sagesser, Mlle. Emma | female | 24.0 | 0 | 0 | PC 17477 | 69.3000 | B35 | C |
| 645 | 646 | 1 | 1 | Harper, Mr. Henry Sleeper | male | 48.0 | 1 | 0 | PC 17572 | 76.7292 | D33 | C |
| 647 | 648 | 1 | 1 | Simonius-Blumer, Col. Oberst Alfons | male | 56.0 | 0 | 0 | 13213 | 35.5000 | A26 | C |
| 659 | 660 | 0 | 1 | Newell, Mr. Arthur Webster | male | 58.0 | 0 | 2 | 35273 | 113.2750 | D48 | C |
| 679 | 680 | 1 | 1 | Cardeza, Mr. Thomas Drake Martinez | male | 36.0 | 0 | 1 | PC 17755 | 512.3292 | B51 B53 B55 | C |
| 681 | 682 | 1 | 1 | Hassab, Mr. Hammad | male | 27.0 | 0 | 0 | PC 17572 | 76.7292 | D49 | C |
| 698 | 699 | 0 | 1 | Thayer, Mr. John Borland | male | 49.0 | 1 | 1 | 17421 | 110.8833 | C68 | C |
| 700 | 701 | 1 | 1 | Astor, Mrs. John Jacob (Madeleine Talmadge Force) | female | 18.0 | 1 | 0 | PC 17757 | 227.5250 | C62 C64 | C |
| 710 | 711 | 1 | 1 | Mayne, Mlle. Berthe Antonine ("Mrs de Villiers") | female | 24.0 | 0 | 0 | PC 17482 | 49.5042 | C90 | C |
| 716 | 717 | 1 | 1 | Endres, Miss. Caroline Louise | female | 38.0 | 0 | 0 | PC 17757 | 227.5250 | C45 | C |
| 737 | 738 | 1 | 1 | Lesurer, Mr. Gustave J | male | 35.0 | 0 | 0 | PC 17755 | 512.3292 | B101 | C |
| 742 | 743 | 1 | 1 | Ryerson, Miss. Susan Parker "Suzette" | female | 21.0 | 2 | 2 | PC 17608 | 262.3750 | B57 B59 B63 B66 | C |
| 789 | 790 | 0 | 1 | Guggenheim, Mr. Benjamin | male | 46.0 | 0 | 0 | PC 17593 | 79.2000 | B82 B84 | C |
| 835 | 836 | 1 | 1 | Compton, Miss. Sara Rebecca | female | 39.0 | 1 | 1 | PC 17756 | 83.1583 | E49 | C |
| 879 | 880 | 1 | 1 | Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) | female | 56.0 | 0 | 1 | 11767 | 83.1583 | C50 | C |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
Comments: not all passengers from Cherbourg are Gallic (🇫🇷: gaulois) …
What is the survival rate for raw data?
df_titanic_raw['Survived'].mean()0.3838383838383838
What is the survival rate for data after removing missing values?
df_titanic['Survived'].mean()0.6721311475409836
Conclusion: be careful when you remove some missing values, the missingness might be informative!
EXERCISE : sex ratio on the boat
Q: What was the proportion of women on the boat?
See also the command:
df_titanic.groupby(['Sex'])[['Survived', 'Age', 'Fare']].mean()| Survived | Age | Fare | |
|---|---|---|---|
| Sex | |||
| female | 0.931818 | 32.676136 | 89.000900 |
| male | 0.431579 | 38.451789 | 69.124343 |
Data import and export
The Pandas library supports many formats: - CSV, text - SQL database - Excel - HDF5 - json - html - pickle - sas, stata - …
Exploration
pd.options.display.max_rows = 8
df_titanic_raw.tail()| Name | PassengerId | Survived | Pclass | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 886 | Montvila, Rev. Juozas | 887 | 0 | 2 | male | 27.0 | 0 | 0 | 211536 | 13.00 | NaN | S |
| 887 | Graham, Miss. Margaret Edith | 888 | 1 | 1 | female | 19.0 | 0 | 0 | 112053 | 30.00 | B42 | S |
| 888 | Johnston, Miss. Catherine Helen "Carrie" | 889 | 0 | 3 | female | NaN | 1 | 2 | W./C. 6607 | 23.45 | NaN | S |
| 889 | Behr, Mr. Karl Howell | 890 | 1 | 1 | male | 26.0 | 0 | 0 | 111369 | 30.00 | C148 | C |
| 890 | Dooley, Mr. Patrick | 891 | 0 | 3 | male | 32.0 | 0 | 0 | 370376 | 7.75 | NaN | Q |
df_titanic_raw.head()| Name | PassengerId | Survived | Pclass | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Braund, Mr. Owen Harris | 1 | 0 | 3 | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | 2 | 1 | 1 | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | Heikkinen, Miss. Laina | 3 | 1 | 3 | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 4 | 1 | 1 | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | Allen, Mr. William Henry | 5 | 0 | 3 | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
Access values by line/columns etc.
iloc
df_titanic_raw.iloc[0:2, 1:8]| PassengerId | Survived | Pclass | Sex | Age | SibSp | Parch | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | male | 22.0 | 1 | 0 |
| 1 | 2 | 1 | 1 | female | 38.0 | 1 | 0 |
loc
# with original index:
# df_titanic_raw.loc[128]
# with naming indexing
df_titanic_raw = df_titanic_raw.set_index('Name') # you can only do it once !!
df_titanic_raw.loc['Bonnell, Miss. Elizabeth', 'Fare']26.55
df_titanic_raw.loc['Bonnell, Miss. Elizabeth']PassengerId 12
Survived 1
Pclass 1
Sex female
...
Ticket 113783
Fare 26.55
Cabin C103
Embarked S
Name: Bonnell, Miss. Elizabeth, Length: 11, dtype: object
df_titanic_raw.loc['Bonnell, Miss. Elizabeth', 'Survived']
df_titanic_raw.loc['Bonnell, Miss. Elizabeth', 'Survived'] = 0df_titanic_raw.loc['Bonnell, Miss. Elizabeth']PassengerId 12
Survived 0
Pclass 1
Sex female
...
Ticket 113783
Fare 26.55
Cabin C103
Embarked S
Name: Bonnell, Miss. Elizabeth, Length: 11, dtype: object
# set back the original value
df_titanic_raw.loc['Bonnell, Miss. Elizabeth', 'Survived'] = 1
df_titanic_raw = df_titanic_raw.reset_index()  # come back to original index

groupby
df_titanic.groupby(['Sex'])[['Survived', 'Age', 'Fare', 'Pclass']].mean()| Survived | Age | Fare | Pclass | |
|---|---|---|---|---|
| Sex | ||||
| female | 0.931818 | 32.676136 | 89.000900 | 1.215909 |
| male | 0.431579 | 38.451789 | 69.124343 | 1.168421 |
Create binned values
# Bin edges for the decades [0, 10), [10, 20), ..., [80, 90).
bins=np.arange(0, 100, 10)
# Current seaborn palette; its first color is used for all bars.
current_palette = sns.color_palette()
# pd.cut turns the continuous Age column into an ordered categorical of intervals.
df_test = pd.DataFrame({ 'Age': pd.cut(df_titanic['Age'], bins=bins, right=False)})
ax = sns.countplot(data=df_test, x='Age', color=current_palette[0])
# Rotate the interval labels so they stay readable on the x-axis.
ax.tick_params(axis='x', labelrotation=90)
Second Case study: air quality in Paris (Source: Airparif)
url = "http://josephsalmon.eu/enseignement/datasets/20080421_20160927-PA13_auto.csv"
path_target = "./20080421_20160927-PA13_auto.csv"
path, fname = os.path.split(path_target)
pooch.retrieve(url, path=path, fname=fname, known_hash=None)'/home/jsalmon/Documents/Mes_cours/Montpellier/HAX712X/Courses/Pandas/20080421_20160927-PA13_auto.csv'
You can run for instance in a terminal:
!head -26 ./20080421_20160927-PA13_auto.csv

Alternatively:
from IPython import get_ipython
get_ipython().system('head -26 ./20080421_20160927-PA13_auto.csv')Time series help: https://jakevdp.github.io/PythonDataScienceHandbook/03.11-working-with-time-series.html
# Parse the semicolon-separated file: '#' lines are comments, "n/d" marks
# missing values, and 'heure' is kept as a string for later concatenation
# with the date column.
polution_df = pd.read_csv('20080421_20160927-PA13_auto.csv', sep=';',
comment='#',
na_values="n/d",
converters={'heure': str})pd.options.display.max_rows = 30
polution_df.head(25)| date | heure | NO2 | O3 | |
|---|---|---|---|---|
| 0 | 21/04/2008 | 1 | 13.0 | 74.0 |
| 1 | 21/04/2008 | 2 | 11.0 | 73.0 |
| 2 | 21/04/2008 | 3 | 13.0 | 64.0 |
| 3 | 21/04/2008 | 4 | 23.0 | 46.0 |
| 4 | 21/04/2008 | 5 | 47.0 | 24.0 |
| 5 | 21/04/2008 | 6 | 70.0 | 11.0 |
| 6 | 21/04/2008 | 7 | 70.0 | 17.0 |
| 7 | 21/04/2008 | 8 | 76.0 | 16.0 |
| 8 | 21/04/2008 | 9 | NaN | NaN |
| 9 | 21/04/2008 | 10 | NaN | NaN |
| 10 | 21/04/2008 | 11 | NaN | NaN |
| 11 | 21/04/2008 | 12 | 33.0 | 60.0 |
| 12 | 21/04/2008 | 13 | 31.0 | 61.0 |
| 13 | 21/04/2008 | 14 | 37.0 | 61.0 |
| 14 | 21/04/2008 | 15 | 20.0 | 78.0 |
| 15 | 21/04/2008 | 16 | 29.0 | 71.0 |
| 16 | 21/04/2008 | 17 | 30.0 | 70.0 |
| 17 | 21/04/2008 | 18 | 38.0 | 58.0 |
| 18 | 21/04/2008 | 19 | 52.0 | 40.0 |
| 19 | 21/04/2008 | 20 | 56.0 | 29.0 |
| 20 | 21/04/2008 | 21 | 39.0 | 40.0 |
| 21 | 21/04/2008 | 22 | 31.0 | 42.0 |
| 22 | 21/04/2008 | 23 | 29.0 | 42.0 |
| 23 | 21/04/2008 | 24 | 28.0 | 36.0 |
| 24 | 22/04/2008 | 1 | 46.0 | 16.0 |
Data preprocessing
EXERCISE : handling missing values
What is the meaning of `na_values="n/d"` above? Note that the same effect can be obtained with the command `polution_df.replace('n/d', np.nan, inplace=True)`.
# check types (per-column dtypes only)
polution_df.dtypes
# check all: dtypes, non-null counts, and memory footprint in one call
polution_df.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73920 entries, 0 to 73919
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 date 73920 non-null object
1 heure 73920 non-null object
2 NO2 71008 non-null float64
3 O3 71452 non-null float64
dtypes: float64(2), object(2)
memory usage: 2.3+ MB
For more info on the object nature (inherited from numpy), see https://stackoverflow.com/questions/21018654/strings-in-a-dataframe-but-dtype-is-object
Issues with non-conventional hours/day format
Start by changing to integer type (e.g., int8):
# Convert 'heure' (currently strings '1'..'24') to a compact integer dtype.
polution_df['heure'] = polution_df['heure'].astype(np.int8)
polution_df['heure']0 1
1 2
2 3
3 4
4 5
..
73915 20
73916 21
73917 22
73918 23
73919 24
Name: heure, Length: 73920, dtype: int8
Now the hours run from 1 to 24… which is not conventional, so let's shift them to run from 0 to 23.
# Shift hours from the file's 1..24 range to the conventional 0..23.
polution_df['heure'] = polution_df['heure'] - 1
polution_df['heure']0 0
1 1
2 2
3 3
4 4
..
73915 19
73916 20
73917 21
73918 22
73919 23
Name: heure, Length: 73920, dtype: int8
and back to strings:
# Back to strings so the hour can be concatenated with the date below.
polution_df['heure'] = polution_df['heure'].astype('str')
polution_df['heure']0 0
1 1
2 2
3 3
4 4
..
73915 19
73916 20
73917 21
73918 22
73919 23
Name: heure, Length: 73920, dtype: object
Time processing
Note that we use here the following conventions: - d = day - m=month - Y=year - H=hour - M=minutes
# Assemble 'dd/mm/YYYY H:00' strings from the two columns and parse them
# into a proper datetime Series.
raw_timestamps = polution_df['date'] + ' ' + polution_df['heure'] + ':00'
time_improved = pd.to_datetime(raw_timestamps, format='%d/%m/%Y %H:%M')
time_improved0 2008-04-21 00:00:00
1 2008-04-21 01:00:00
2 2008-04-21 02:00:00
3 2008-04-21 03:00:00
4 2008-04-21 04:00:00
...
73915 2016-09-27 19:00:00
73916 2016-09-27 20:00:00
73917 2016-09-27 21:00:00
73918 2016-09-27 22:00:00
73919 2016-09-27 23:00:00
Length: 73920, dtype: datetime64[ns]
polution_df['date'] + ' ' + polution_df['heure'] + ':00'0 21/04/2008 0:00
1 21/04/2008 1:00
2 21/04/2008 2:00
3 21/04/2008 3:00
4 21/04/2008 4:00
...
73915 27/09/2016 19:00
73916 27/09/2016 20:00
73917 27/09/2016 21:00
73918 27/09/2016 22:00
73919 27/09/2016 23:00
Length: 73920, dtype: object
Create correct timing format in the dataframe
# Attach the parsed timestamps, then drop the now-redundant raw columns.
polution_df['DateTime'] = time_improved
polution_df = polution_df.drop(columns=['heure', 'date'])
polution_df| NO2 | O3 | DateTime | |
|---|---|---|---|
| 0 | 13.0 | 74.0 | 2008-04-21 00:00:00 |
| 1 | 11.0 | 73.0 | 2008-04-21 01:00:00 |
| 2 | 13.0 | 64.0 | 2008-04-21 02:00:00 |
| 3 | 23.0 | 46.0 | 2008-04-21 03:00:00 |
| 4 | 47.0 | 24.0 | 2008-04-21 04:00:00 |
| ... | ... | ... | ... |
| 73915 | 55.0 | 31.0 | 2016-09-27 19:00:00 |
| 73916 | 85.0 | 5.0 | 2016-09-27 20:00:00 |
| 73917 | 75.0 | 9.0 | 2016-09-27 21:00:00 |
| 73918 | 64.0 | 15.0 | 2016-09-27 22:00:00 |
| 73919 | 57.0 | 14.0 | 2016-09-27 23:00:00 |
Visualize the data set now that the time is well formatted:
# Index by timestamp and keep the rows in chronological order.
polution_ts = polution_df.set_index('DateTime').sort_index(ascending=True)
polution_ts.head(12)| NO2 | O3 | |
|---|---|---|
| DateTime | ||
| 2008-04-21 00:00:00 | 13.0 | 74.0 |
| 2008-04-21 01:00:00 | 11.0 | 73.0 |
| 2008-04-21 02:00:00 | 13.0 | 64.0 |
| 2008-04-21 03:00:00 | 23.0 | 46.0 |
| 2008-04-21 04:00:00 | 47.0 | 24.0 |
| 2008-04-21 05:00:00 | 70.0 | 11.0 |
| 2008-04-21 06:00:00 | 70.0 | 17.0 |
| 2008-04-21 07:00:00 | 76.0 | 16.0 |
| 2008-04-21 08:00:00 | NaN | NaN |
| 2008-04-21 09:00:00 | NaN | NaN |
| 2008-04-21 10:00:00 | NaN | NaN |
| 2008-04-21 11:00:00 | 33.0 | 60.0 |
polution_ts.describe()| NO2 | O3 | |
|---|---|---|
| count | 71008.000000 | 71452.000000 |
| mean | 34.453414 | 39.610046 |
| std | 20.380702 | 28.837333 |
| min | 1.000000 | 0.000000 |
| 25% | 19.000000 | 16.000000 |
| 50% | 30.000000 | 38.000000 |
| 75% | 46.000000 | 58.000000 |
| max | 167.000000 | 211.000000 |
# Raw hourly series, one panel per pollutant.
fig, axes = plt.subplots(2, 1, figsize=(6, 6), sharex=True)
for ax, col, gas in zip(axes, ['O3', 'NO2'], ['Ozone', 'Nitrogen']):
    ax.plot(polution_ts[col])
    ax.set_title(f"{gas} polution: daily average in Paris")
    ax.set_ylabel("Concentration (µg/m³)")
plt.show()
# Daily extremes: dashed = daily max, dash-dot = daily min, per pollutant.
fig, axes = plt.subplots(2, 1, figsize=(10, 5), sharex=True)
for ax, col, gas in zip(axes, ['O3', 'NO2'], ['Ozone', 'Nitrogen']):
    daily = polution_ts[col].resample('d')
    ax.plot(daily.max(), '--')
    ax.plot(daily.min(), '-.')
    ax.set_title(f"{gas} polution: daily average in Paris")
    ax.set_ylabel("Concentration (µg/m³)")
plt.show()
Source: https://www.tutorialspoint.com/python/time_strptime.htm
EXERCISE : worst of the day
Provide the same plots as before, but with daily best and worst on the same figures (and use different colors and/or styles)
Q: Is the pollution getting better over the years or not?
fig, ax = plt.subplots(1, 1)
# Resample by calendar year ('Y' for Year; 'A', for Annual, also works)
# and plot the yearly mean of each pollutant.
yearly_mean = polution_ts['2008':].resample('Y').mean()
yearly_mean.plot(ax=ax)
plt.ylim(0, 50)
plt.title("Pollution evolution: \n yearly average in Paris")
plt.ylabel("Concentration (µg/m³)")
plt.xlabel("Year")
plt.show()
# Sequential palette: one shade per weekday.
sns.set_palette("GnBu_d", n_colors=7)
polution_ts['weekday'] = polution_ts.index.weekday  # Monday=0, Sunday=6
polution_ts['weekend'] = polution_ts['weekday'].isin([5, 6])
# Mean concentration per (weekday, hour-of-day); weekdays become columns.
by_weekday_hour = polution_ts.groupby(['weekday', polution_ts.index.hour])
polution_week_no2 = by_weekday_hour['NO2'].mean().unstack(level=0)
polution_week_03 = by_weekday_hour['O3'].mean().unstack(level=0)
plt.show()fig, axes = plt.subplots(2, 1, figsize=(7, 7), sharex=True)
# Top panel: mean NO2 per hour of day, one curve per weekday.
polution_week_no2.plot(ax=axes[0])
axes[0].set_ylabel("Concentration (µg/m³)")
axes[0].set_xlabel("Intraday evolution")
axes[0].set_title(
"Daily NO2 concentration: weekend effect?")
axes[0].set_xticks(np.arange(0, 24))
axes[0].set_xticklabels(np.arange(0, 24), rotation=45)
axes[0].set_ylim(0, 60)
# Bottom panel: same for O3.
polution_week_03.plot(ax=axes[1])
axes[1].set_ylabel("Concentration (µg/m³)")
axes[1].set_xlabel("Intraday evolution")
axes[1].set_title("Daily O3 concentration: effet du weekend?")
axes[1].set_xticks(np.arange(0, 24))
axes[1].set_xticklabels(np.arange(0, 24), rotation=45)
axes[1].set_ylim(0, 70)
# Hide the top legend: one legend (Monday..Sunday) serves both panels.
axes[0].legend().set_visible(False)
axes[1].legend(labels=[day for day in calendar.day_name], loc='lower left', bbox_to_anchor=(1, 0.1))
plt.tight_layout()
# Month number from the index (1=January, ..., 12=December — pandas months
# are 1-based), then mapped to its English abbreviation ('Jan', ..., 'Dec').
polution_ts['month'] = polution_ts.index.month
polution_ts['month'] = polution_ts['month'].apply(lambda x:
calendar.month_abbr[x])
polution_ts.head()| NO2 | O3 | weekday | weekend | month | |
|---|---|---|---|---|---|
| DateTime | |||||
| 2008-04-21 00:00:00 | 13.0 | 74.0 | 0 | False | Apr |
| 2008-04-21 01:00:00 | 11.0 | 73.0 | 0 | False | Apr |
| 2008-04-21 02:00:00 | 13.0 | 64.0 | 0 | False | Apr |
| 2008-04-21 03:00:00 | 23.0 | 46.0 | 0 | False | Apr |
| 2008-04-21 04:00:00 | 47.0 | 24.0 | 0 | False | Apr |
# Mean concentration per (month, hour-of-day); months become columns.
# NOTE(review): unstack sorts the month abbreviations alphabetically,
# not chronologically.
polution_month_no2 = polution_ts.groupby(['month', polution_ts.index.hour])[
'NO2'].mean().unstack(level=0)
polution_month_03 = polution_ts.groupby(['month', polution_ts.index.hour])[
'O3'].mean().unstack(level=0)sns.set_palette("Paired", n_colors=12)
fig, axes = plt.subplots(2, 1, figsize=(7, 7), sharex=True)
# BUG FIX: groupby/unstack sorted the month abbreviations alphabetically
# (Apr, Aug, Dec, ...), while the legend below lists January..December.
# Reorder the columns chronologically so legend entries match the curves.
month_order = calendar.month_abbr[1:]
polution_month_no2[month_order].plot(ax=axes[0])
axes[0].set_ylabel("Concentration (µg/m³)")
axes[0].set_xlabel("Heure de la journée")
axes[0].set_title(
    "Profil journalier de la pollution au NO2: effet du weekend?")
axes[0].set_xticks(np.arange(0, 24))
axes[0].set_xticklabels(np.arange(0, 24), rotation=45)
axes[0].set_ylim(0, 90)
polution_month_03[month_order].plot(ax=axes[1])
axes[1].set_ylabel("Concentration (µg/m³)")
axes[1].set_xlabel("Heure de la journée")
axes[1].set_title("Profil journalier de la pollution au O3: effet du weekend?")
axes[1].set_xticks(np.arange(0, 24))
axes[1].set_xticklabels(np.arange(0, 24), rotation=45)
axes[1].set_ylim(0, 90)
# Single legend (full month names, now aligned with the reordered columns).
axes[0].legend().set_visible(False)
axes[1].legend(labels=calendar.month_name[1:], loc='lower left',
               bbox_to_anchor=(1, 0.1))
plt.tight_layout()
Third example (your turn: explore a dataset on bike accidents on your own)
https://www.data.gouv.fr/fr/datasets/accidents-de-velo-en-france/
Possible visualization: https://koumoul.com/en/datasets/accidents-velos
# Download the French bicycle-accident dataset (cached locally by pooch).
url = "https://koumoul.com/s/data-fair/api/v1/datasets/accidents-velos/raw"
path_target = "./bicycle_db.csv"
path, fname = os.path.split(path_target)
pooch.retrieve(url, path=path, fname=fname, known_hash=None)'/home/jsalmon/Documents/Mes_cours/Montpellier/HAX712X/Courses/Pandas/bicycle_db.csv'
# df: data frame
# BUG FIX: the dtype mapping used the key 'data' (a typo for 'date'); since
# no 'data' column exists, the intended str dtype was silently never applied.
df_bikes = pd.read_csv("bicycle_db.csv", na_values="", low_memory=False,
                       dtype={'date': str, 'heure': str, 'departement': str})
get_ipython().system('head -5 ./bicycle_db.csv')
pd.options.display.max_columns = 40
df_bikes.head()| identifiant accident | date | mois | jour | heure | departement | commune | lat | lon | en agglomeration | type intersection | type collision | luminosite | conditions atmosperiques | type route | circulation | nb voies | profil long route | trace plan route | largeur TPC | largeur route | etat surface | amenagement | situation | categorie usager | gravite accident | sexe | age | motif deplacement | existence securite | usage securite | obstacle fixe heurte | obstacle mobile heurte | localisation choc | manoeuvre avant accident | identifiant vehicule | type autres vehicules | manoeuvre autres vehicules | nombre autres vehicules | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 200500000030 | 2005-01-13 | 01 - janvier | 3 - jeudi | 19 | 62 | 62331 | 50.300 | 2.840 | oui | Hors intersection | Deux véhicules - par le coté | Nuit avec éclairage public allumé | Normale | Route Départementale | NaN | NaN | NaN | Partie rectiligne | NaN | 50.0 | normale | NaN | Sur chaussée | Conducteur | 1 - Blessé léger | M | 57-58 | Promenade - loisirs | NaN | NaN | NaN | Véhicule | Côté gauche | Changeant de file à gauche | 200500000030B02 | Transport en commun | Dépassant à gauche | 1.0 |
| 1 | 200500000034 | 2005-01-19 | 01 - janvier | 2 - mercredi | 10 | 62 | 62022 | 0.000 | 0.000 | non | Hors intersection | Deux véhicules - frontale | Plein jour | Temps éblouissant | Route Départementale | NaN | NaN | Plat | En courbe à droite | NaN | 50.0 | normale | NaN | Sur chaussée | Conducteur | 2 - Blessé hospitalisé | M | 19-20 | Promenade - loisirs | NaN | NaN | NaN | Véhicule | Avant | Sans changement de direction | 200500000034B02 | VU seul 1,5T <= PTAC <= 3,5T avec ou sans remo... | Tournant à gauche | 1.0 |
| 2 | 200500000078 | 2005-01-26 | 01 - janvier | 2 - mercredi | 13 | 02 | 02173 | 0.000 | 0.000 | non | Autre intersection | Deux véhicules - par le coté | Plein jour | Normale | Route Départementale | NaN | 2.0 | Pente | Partie rectiligne | NaN | NaN | normale | NaN | Sur chaussée | Conducteur | 1 - Blessé léger | M | 70-71 | Promenade - loisirs | Casque | Non | NaN | Véhicule | Avant | Sans changement de direction | 200500000078B02 | VL seul | Tournant à gauche | 1.0 |
| 3 | 200500000093 | 2005-01-03 | 01 - janvier | 0 - lundi | 13 | 02 | 02810 | 49.255 | 3.094 | oui | Hors intersection | Deux véhicules - frontale | Plein jour | Normale | Route Départementale | NaN | NaN | Plat | En courbe à gauche | NaN | 52.0 | normale | NaN | Sur chaussée | Conducteur | 2 - Blessé hospitalisé | F | 50-51 | Utilisation professionnelle | NaN | NaN | NaN | Véhicule | Avant gauche | Manœuvre d’évitement | 200500000093B02 | VL seul | Manœuvre d’évitement | 1.0 |
| 4 | 200500000170 | 2005-01-29 | 01 - janvier | 5 - samedi | 18 | 76 | 76196 | 0.000 | 0.000 | non | Hors intersection | Deux véhicules - par l’arrière | Nuit sans éclairage public | Normale | Route Départementale | NaN | 2.0 | Plat | Partie rectiligne | NaN | 50.0 | normale | NaN | Sur chaussée | Conducteur | 1 - Blessé léger | M | 73-74 | Promenade - loisirs | Autre | Oui | NaN | Véhicule | Arrière | Même sens, même file | 200500000170A01 | VU seul 1,5T <= PTAC <= 3,5T avec ou sans remo... | Même sens, même file | 1.0 |
df_bikes['existence securite'].unique()array([nan, 'Casque', 'Autre', 'Equipement réfléchissant', 'Ceinture',
'Dispositif enfants'], dtype=object)
df_bikes['gravite accident'].unique()array(['1 - Blessé léger', '2 - Blessé hospitalisé', '3 - Tué',
'0 - Indemne'], dtype=object)
Handle missing values in heure
df_bikes['date'].hasnans
df_bikes['heure'].hasnansTrue
pd.options.display.max_rows = 20
df_bikes.iloc[400:402]| identifiant accident | date | mois | jour | heure | departement | commune | lat | lon | en agglomeration | type intersection | type collision | luminosite | conditions atmosperiques | type route | circulation | nb voies | profil long route | trace plan route | largeur TPC | largeur route | etat surface | amenagement | situation | categorie usager | gravite accident | sexe | age | motif deplacement | existence securite | usage securite | obstacle fixe heurte | obstacle mobile heurte | localisation choc | manoeuvre avant accident | identifiant vehicule | type autres vehicules | manoeuvre autres vehicules | nombre autres vehicules | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 400 | 200500008935 | 2005-02-13 | 02 - février | 6 - dimanche | NaN | 75 | 75018 | 0.0 | 0.0 | oui | Intersection en X | Deux véhicules - par le coté | Nuit avec éclairage public allumé | Vent fort - tempête | Voie Communale | NaN | 4.0 | Plat | Partie rectiligne | NaN | 120.0 | normale | NaN | Sur chaussée | Conducteur | 1 - Blessé léger | M | 32-33 | Domicile - travail | Casque | Oui | NaN | Véhicule | Avant gauche | Dans le couloir bus, dans le même sens | 200500008935B01 | VL seul | Tournant a droite | 1.0 |
| 401 | 200500008941 | 2005-02-14 | 02 - février | 0 - lundi | 15 | 75 | 75007 | 0.0 | 0.0 | oui | Hors intersection | Deux véhicules - par le coté | Plein jour | Normale | Voie Communale | NaN | 4.0 | Plat | Partie rectiligne | NaN | 120.0 | normale | NaN | Sur chaussée | Conducteur | 1 - Blessé léger | F | 21-22 | Promenade - loisirs | Casque | Oui | NaN | Véhicule | Avant droit | Sans changement de direction | 200500008941A01 | VL seul | En s’insérant | 1.0 |
Replace missing hour entries (empty strings) by np.nan:
# Empty-string hours become NaN so they can be dropped with dropna below.
df_bikes['heure'] = df_bikes['heure'].replace('', np.nan)
df_bikes.iloc[400:402]| identifiant accident | date | mois | jour | heure | departement | commune | lat | lon | en agglomeration | type intersection | type collision | luminosite | conditions atmosperiques | type route | circulation | nb voies | profil long route | trace plan route | largeur TPC | largeur route | etat surface | amenagement | situation | categorie usager | gravite accident | sexe | age | motif deplacement | existence securite | usage securite | obstacle fixe heurte | obstacle mobile heurte | localisation choc | manoeuvre avant accident | identifiant vehicule | type autres vehicules | manoeuvre autres vehicules | nombre autres vehicules | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 400 | 200500008935 | 2005-02-13 | 02 - février | 6 - dimanche | NaN | 75 | 75018 | 0.0 | 0.0 | oui | Intersection en X | Deux véhicules - par le coté | Nuit avec éclairage public allumé | Vent fort - tempête | Voie Communale | NaN | 4.0 | Plat | Partie rectiligne | NaN | 120.0 | normale | NaN | Sur chaussée | Conducteur | 1 - Blessé léger | M | 32-33 | Domicile - travail | Casque | Oui | NaN | Véhicule | Avant gauche | Dans le couloir bus, dans le même sens | 200500008935B01 | VL seul | Tournant a droite | 1.0 |
| 401 | 200500008941 | 2005-02-14 | 02 - février | 0 - lundi | 15 | 75 | 75007 | 0.0 | 0.0 | oui | Hors intersection | Deux véhicules - par le coté | Plein jour | Normale | Voie Communale | NaN | 4.0 | Plat | Partie rectiligne | NaN | 120.0 | normale | NaN | Sur chaussée | Conducteur | 1 - Blessé léger | F | 21-22 | Promenade - loisirs | Casque | Oui | NaN | Véhicule | Avant droit | Sans changement de direction | 200500008941A01 | VL seul | En s’insérant | 1.0 |
# Discard rows whose hour is missing (needed to build timestamps below).
df_bikes.dropna(subset=['heure'], inplace=True)
df_bikes.iloc[399:402]| identifiant accident | date | mois | jour | heure | departement | commune | lat | lon | en agglomeration | type intersection | type collision | luminosite | conditions atmosperiques | type route | circulation | nb voies | profil long route | trace plan route | largeur TPC | largeur route | etat surface | amenagement | situation | categorie usager | gravite accident | sexe | age | motif deplacement | existence securite | usage securite | obstacle fixe heurte | obstacle mobile heurte | localisation choc | manoeuvre avant accident | identifiant vehicule | type autres vehicules | manoeuvre autres vehicules | nombre autres vehicules | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 399 | 200500008875 | 2005-02-10 | 02 - février | 3 - jeudi | 15 | 75 | 75016 | 0.0 | 0.0 | oui | Hors intersection | Deux véhicules - par l’arrière | Plein jour | Normale | Voie Communale | NaN | 4.0 | Plat | Partie rectiligne | NaN | 120.0 | normale | NaN | Sur chaussée | Conducteur | 1 - Blessé léger | M | 53-54 | Promenade - loisirs | NaN | NaN | NaN | Véhicule | Arrière | Sans changement de direction | 200500008875B01 | Bicyclette | Sans changement de direction | 1.0 |
| 401 | 200500008941 | 2005-02-14 | 02 - février | 0 - lundi | 15 | 75 | 75007 | 0.0 | 0.0 | oui | Hors intersection | Deux véhicules - par le coté | Plein jour | Normale | Voie Communale | NaN | 4.0 | Plat | Partie rectiligne | NaN | 120.0 | normale | NaN | Sur chaussée | Conducteur | 1 - Blessé léger | F | 21-22 | Promenade - loisirs | Casque | Oui | NaN | Véhicule | Avant droit | Sans changement de direction | 200500008941A01 | VL seul | En s’insérant | 1.0 |
| 402 | 200500008961 | 2005-02-11 | 02 - février | 4 - vendredi | 12 | 75 | 75005 | 0.0 | 0.0 | oui | Intersection en T | Deux véhicules - par le coté | Plein jour | Normale | Voie Communale | NaN | 2.0 | Plat | Partie rectiligne | NaN | 60.0 | mouillée | NaN | Sur chaussée | Conducteur | 1 - Blessé léger | F | 27-28 | Autre | Casque | Oui | NaN | Véhicule | Avant gauche | Sans changement de direction | 200500008961A01 | Scooter immatriculé | Sans changement de direction | 1.0 |
EXERCISE : Dates?
Can you find the starting day and the ending day of the study automatically?
Hint: sort the data! You can sort the data by time, for instance with df.sort_values('Time') (the old df.sort method has been removed from pandas).
df_bikes['date'] + ' ' + df_bikes['heure'] + ':00'0 2005-01-13 19:00
1 2005-01-19 10:00
2 2005-01-26 13:00
3 2005-01-03 13:00
4 2005-01-29 18:00
...
65976 2018-09-27 8:00
65977 2018-03-21 18:00
65978 2018-03-31 17:00
65979 2018-03-31 17:00
65980 2018-07-31 11:00
Length: 65515, dtype: object
# ADAPT OLD to create the df_bikes['Time']
# Here the date is already ISO-formatted ('YYYY-mm-dd'), unlike the
# pollution data, hence the different strptime format string.
time_improved = pd.to_datetime(df_bikes['date'] +
' ' + df_bikes['heure'] + ':00',
format='%Y-%m-%d %H:%M')
# Where d = day, m=month, Y=year, H=hour, M=minutes
# create correct timing format in the dataframedf_bikes['Time'] = time_improved
df_bikes.set_index('Time', inplace=True)
# remove useless columns (the raw date/hour strings are now redundant)
del df_bikes['heure']
del df_bikes['date']df_bikes.info()<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 65515 entries, 2005-01-13 19:00:00 to 2018-07-31 11:00:00
Data columns (total 37 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 identifiant accident 65515 non-null int64
1 mois 65515 non-null object
2 jour 65515 non-null object
3 departement 65515 non-null object
4 commune 65515 non-null object
5 lat 65515 non-null float64
6 lon 65249 non-null float64
7 en agglomeration 65515 non-null object
8 type intersection 65513 non-null object
9 type collision 65511 non-null object
10 luminosite 65515 non-null object
11 conditions atmosperiques 65512 non-null object
12 type route 65504 non-null object
13 circulation 143 non-null object
14 nb voies 57417 non-null float64
15 profil long route 60966 non-null object
16 trace plan route 59460 non-null object
17 largeur TPC 5142 non-null float64
18 largeur route 39991 non-null float64
19 etat surface 63000 non-null object
20 amenagement 7231 non-null object
21 situation 61378 non-null object
22 categorie usager 65515 non-null object
23 gravite accident 65515 non-null object
24 sexe 65515 non-null object
25 age 65489 non-null object
26 motif deplacement 51958 non-null object
27 existence securite 60160 non-null object
28 usage securite 58292 non-null object
29 obstacle fixe heurte 1434 non-null object
30 obstacle mobile heurte 52601 non-null object
31 localisation choc 56474 non-null object
32 manoeuvre avant accident 58859 non-null object
33 identifiant vehicule 65515 non-null object
34 type autres vehicules 56827 non-null object
35 manoeuvre autres vehicules 52643 non-null object
36 nombre autres vehicules 56827 non-null float64
dtypes: float64(6), int64(1), object(30)
memory usage: 19.0+ MB
# Keep only the columns needed for the helmet/severity analysis.
df_bike2 = df_bikes.loc[
:, ["gravite accident", "existence securite", "age", "sexe"]
]
# Treat 'Inconnu' (unknown safety equipment) as missing data.
df_bike2["existence securite"].replace({"Inconnu": np.nan}, inplace=True)
df_bike2.dropna(inplace=True)EXERCISE: Is the helmet saving your life?
Perform an analysis so that you can check the benefit or not of wearing a helmet to save your life. Beware: preprocessing is needed to use pd.crosstab, pivot_table to avoid issues.
# Count accidents by severity × sex (rows) and safety equipment (columns);
# 'age' is only used as a non-null column to count over.
group = df_bike2.pivot_table(columns='existence securite',
index=['gravite accident', 'sexe'],
aggfunc={'age': 'count'}, margins=True)
group| age | |||||||
|---|---|---|---|---|---|---|---|
| existence securite | Autre | Casque | Ceinture | Dispositif enfants | Equipement réfléchissant | All | |
| gravite accident | sexe | ||||||
| 0 - Indemne | F | 187.0 | 298.0 | 8.0 | 1.0 | 33.0 | 527 |
| M | 1121.0 | 1952.0 | 47.0 | 6.0 | 189.0 | 3315 | |
| 1 - Blessé léger | F | 3949.0 | 5143.0 | 48.0 | 21.0 | 815.0 | 9976 |
| M | 9180.0 | 13892.0 | 171.0 | 52.0 | 1912.0 | 25207 | |
| 2 - Blessé hospitalisé | F | 1409.0 | 1977.0 | 236.0 | 9.0 | 310.0 | 3941 |
| M | 4022.0 | 9241.0 | 781.0 | 24.0 | 1147.0 | 15215 | |
| 3 - Tué | F | 87.0 | 164.0 | 32.0 | NaN | 37.0 | 320 |
| M | 291.0 | 1062.0 | 117.0 | 3.0 | 160.0 | 1633 | |
| All | 20246.0 | 33729.0 | 1440.0 | 116.0 | 4603.0 | 60134 | |
pd.crosstab(df_bike2['existence securite'],
df_bike2['gravite accident'], normalize='index') * 100| gravite accident | 0 - Indemne | 1 - Blessé léger | 2 - Blessé hospitalisé | 3 - Tué |
|---|---|---|---|---|
| existence securite | ||||
| Autre | 6.460535 | 64.847377 | 26.825052 | 1.867035 |
| Casque | 6.670817 | 56.435115 | 33.259213 | 3.634854 |
| Ceinture | 3.819444 | 15.208333 | 70.625000 | 10.347222 |
| Dispositif enfants | 6.034483 | 62.931034 | 28.448276 | 2.586207 |
| Equipement réfléchissant | 4.822942 | 59.243971 | 31.653270 | 4.279818 |
pd.crosstab(df_bike2['existence securite'],
df_bike2['gravite accident'], values=df_bike2['age'],
aggfunc='count', normalize='index') * 100| gravite accident | 0 - Indemne | 1 - Blessé léger | 2 - Blessé hospitalisé | 3 - Tué |
|---|---|---|---|---|
| existence securite | ||||
| Autre | 6.460535 | 64.847377 | 26.825052 | 1.867035 |
| Casque | 6.670817 | 56.435115 | 33.259213 | 3.634854 |
| Ceinture | 3.819444 | 15.208333 | 70.625000 | 10.347222 |
| Dispositif enfants | 6.034483 | 62.931034 | 28.448276 | 2.586207 |
| Equipement réfléchissant | 4.822942 | 59.243971 | 31.653270 | 4.279818 |
EXERCISE: Are men and women dying equally on a bike?
Perform an analysis to check differences between men’s and women’s survival.
# Boolean mask of fatal accidents.
idx_dead = df_bikes['gravite accident'] == '3 - Tué'
df_deads = df_bikes[idx_dead]
# Share of each sex among the fatalities.
df_gravite = df_deads.groupby('sexe').size() / idx_dead.sum()
df_gravitesexe
F 0.172278
M 0.827722
dtype: float64
df_bikes.groupby('sexe').size() / df_bikes.shape[0]sexe
F 0.248477
M 0.751523
dtype: float64
# Sex distribution within each severity level, as percentages
# (normalize='columns' makes each severity column sum to 100).
pd.crosstab(df_bike2['sexe'],
df_bike2['gravite accident'],
values=df_bike2['age'], aggfunc='count',
normalize='columns', margins=True) * 100| gravite accident | 0 - Indemne | 1 - Blessé léger | 2 - Blessé hospitalisé | 3 - Tué | All |
|---|---|---|---|---|---|
| sexe | |||||
| F | 13.716814 | 28.354603 | 20.573189 | 16.385049 | 24.551834 |
| M | 86.283186 | 71.645397 | 79.426811 | 83.614951 | 75.448166 |
To conclude
Note: information on the level of bike practice by men/women is missing…
EXERCISE: Accident during the week?
Perform an analysis to check when the accidents are occurring during the week.
df_bikes| identifiant accident | mois | jour | departement | commune | lat | lon | en agglomeration | type intersection | type collision | luminosite | conditions atmosperiques | type route | circulation | nb voies | profil long route | trace plan route | largeur TPC | largeur route | etat surface | amenagement | situation | categorie usager | gravite accident | sexe | age | motif deplacement | existence securite | usage securite | obstacle fixe heurte | obstacle mobile heurte | localisation choc | manoeuvre avant accident | identifiant vehicule | type autres vehicules | manoeuvre autres vehicules | nombre autres vehicules | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Time | |||||||||||||||||||||||||||||||||||||
| 2005-01-13 19:00:00 | 200500000030 | 01 - janvier | 3 - jeudi | 62 | 62331 | 50.30000 | 2.84000 | oui | Hors intersection | Deux véhicules - par le coté | Nuit avec éclairage public allumé | Normale | Route Départementale | NaN | NaN | NaN | Partie rectiligne | NaN | 50.0 | normale | NaN | Sur chaussée | Conducteur | 1 - Blessé léger | M | 57-58 | Promenade - loisirs | NaN | NaN | NaN | Véhicule | Côté gauche | Changeant de file à gauche | 200500000030B02 | Transport en commun | Dépassant à gauche | 1.0 |
| 2005-01-19 10:00:00 | 200500000034 | 01 - janvier | 2 - mercredi | 62 | 62022 | 0.00000 | 0.00000 | non | Hors intersection | Deux véhicules - frontale | Plein jour | Temps éblouissant | Route Départementale | NaN | NaN | Plat | En courbe à droite | NaN | 50.0 | normale | NaN | Sur chaussée | Conducteur | 2 - Blessé hospitalisé | M | 19-20 | Promenade - loisirs | NaN | NaN | NaN | Véhicule | Avant | Sans changement de direction | 200500000034B02 | VU seul 1,5T <= PTAC <= 3,5T avec ou sans remo... | Tournant à gauche | 1.0 |
| 2005-01-26 13:00:00 | 200500000078 | 01 - janvier | 2 - mercredi | 02 | 02173 | 0.00000 | 0.00000 | non | Autre intersection | Deux véhicules - par le coté | Plein jour | Normale | Route Départementale | NaN | 2.0 | Pente | Partie rectiligne | NaN | NaN | normale | NaN | Sur chaussée | Conducteur | 1 - Blessé léger | M | 70-71 | Promenade - loisirs | Casque | Non | NaN | Véhicule | Avant | Sans changement de direction | 200500000078B02 | VL seul | Tournant à gauche | 1.0 |
| 2005-01-03 13:00:00 | 200500000093 | 01 - janvier | 0 - lundi | 02 | 02810 | 49.25500 | 3.09400 | oui | Hors intersection | Deux véhicules - frontale | Plein jour | Normale | Route Départementale | NaN | NaN | Plat | En courbe à gauche | NaN | 52.0 | normale | NaN | Sur chaussée | Conducteur | 2 - Blessé hospitalisé | F | 50-51 | Utilisation professionnelle | NaN | NaN | NaN | Véhicule | Avant gauche | Manœuvre d’évitement | 200500000093B02 | VL seul | Manœuvre d’évitement | 1.0 |
| 2005-01-29 18:00:00 | 200500000170 | 01 - janvier | 5 - samedi | 76 | 76196 | 0.00000 | 0.00000 | non | Hors intersection | Deux véhicules - par l’arrière | Nuit sans éclairage public | Normale | Route Départementale | NaN | 2.0 | Plat | Partie rectiligne | NaN | 50.0 | normale | NaN | Sur chaussée | Conducteur | 1 - Blessé léger | M | 73-74 | Promenade - loisirs | Autre | Oui | NaN | Véhicule | Arrière | Même sens, même file | 200500000170A01 | VU seul 1,5T <= PTAC <= 3,5T avec ou sans remo... | Même sens, même file | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2018-09-27 08:00:00 | 201800057521 | 09 - septembre | 3 - jeudi | 974 | 97416 | -21.29643 | 55.46052 | non | Hors intersection | Deux véhicules - par le coté | Plein jour | Normale | Voie Communale | NaN | 2.0 | Plat | Partie rectiligne | NaN | NaN | normale | NaN | Sur bande d’arrêt d’urgence | Conducteur | 1 - Blessé léger | M | 25-26 | Domicile - travail | Casque | Oui | NaN | Véhicule | Avant | Même sens, même file | 201800057521B01 | VL seul | Tournant à gauche | 1.0 |
| 2018-03-21 18:00:00 | 201800057582 | 03 - mars | 2 - mercredi | 976 | 97611 | -12.76833 | 45.22532 | oui | Autre intersection | Deux véhicules - par l’arrière | Crépuscule ou aube | Normale | Route Nationale | NaN | 2.0 | Plat | Partie rectiligne | NaN | NaN | normale | NaN | Sur chaussée | Conducteur | 1 - Blessé léger | M | 30-31 | Autre | Casque | Oui | NaN | Véhicule | Avant gauche | En s’insérant | 201800057582B01 | VL seul | Même sens, même file | 1.0 |
| 2018-03-31 17:00:00 | 201800057587 | 03 - mars | 5 - samedi | 976 | 97611 | -12.78604 | 45.22107 | oui | Hors intersection | Deux véhicules - par le coté | Crépuscule ou aube | Normale | Voie Communale | NaN | 2.0 | Sommet de côte | Partie rectiligne | NaN | NaN | normale | NaN | Sur chaussée | Conducteur | 1 - Blessé léger | M | 12-13 | Autre | Autre | Non déterminable | NaN | Véhicule | Côté gauche | NaN | 201800057587B01 | VL seul | Dépassant à gauche | 1.0 |
| 2018-03-31 17:00:00 | 201800057587 | 03 - mars | 5 - samedi | 976 | 97611 | -12.78604 | 45.22107 | oui | Hors intersection | Deux véhicules - par le coté | Crépuscule ou aube | Normale | Voie Communale | NaN | 2.0 | Sommet de côte | Partie rectiligne | NaN | NaN | normale | NaN | Sur chaussée | Passager | 1 - Blessé léger | M | 4-5 | Autre | Autre | Non déterminable | NaN | Véhicule | Côté gauche | NaN | 201800057587B01 | VL seul | Dépassant à gauche | 1.0 |
| 2018-07-31 11:00:00 | 201800057676 | 07 - juillet | 1 - mardi | 976 | 97611 | -12.77302 | 45.22106 | oui | Hors intersection | Autre | Plein jour | Normale | Voie Communale | NaN | 2.0 | Plat | Partie rectiligne | NaN | NaN | normale | NaN | Sur accotement | Piéton | 1 - Blessé léger | M | 3-4 | NaN | Dispositif enfants | NaN | NaN | Piéton | Avant | NaN | 201800057676A01 | NaN | NaN | NaN |
# Load a sequential palette: one shade per weekday.
sns.set_palette("GnBu_d", n_colors=7)
df_bikes['weekday'] = df_bikes.index.day_of_week  # Monday=0, Sunday=6

# Accident counts per (weekday, hour-of-day); weekdays become columns.
accidents_week = (
    df_bikes.groupby(['weekday', df_bikes.index.hour])['sexe']
    .count()
    .unstack(level=0)
)

fig, ax = plt.subplots(1, 1, figsize=(7, 7))
accidents_week.plot(ax=ax)
ax.set_ylabel("Accidents")
ax.set_xlabel("Heure de la journée")
ax.set_title("Profil journalier des accidents: effet du weekend?")
ax.set_xticks(np.arange(0, 24))
ax.set_xticklabels(np.arange(0, 24), rotation=45)
ax.legend(
    labels=list(calendar.day_name),
    loc='upper left',
)
plt.tight_layout()
df_bikes.groupby(['weekday', df_bikes.index.hour])[
'sexe'].count()weekday Time
0 1 20
2 4
3 3
4 20
5 41
...
6 19 340
20 217
21 122
22 67
23 59
Name: sexe, Length: 161, dtype: int64
EXERCISE: Accident during the year?
Perform an analysis to check when the accidents are occurring during the year.
df_bikes['month'] = df_bikes.index.month  # January=1, ..., December=12
df_bikes['month'] = df_bikes['month'].apply(lambda x: calendar.month_abbr[x])
df_bikes.head()
sns.set_palette("GnBu_d", n_colors=12)  # sns.set_palette("colorblind",...)
# Accident counts per (month, hour-of-day); months become columns.
df_bikes_month = df_bikes.groupby(['month', df_bikes.index.hour])[
    'age'].count().unstack(level=0)
# BUG FIX: groupby sorts the month abbreviations alphabetically, while the
# legend lists January..December; reorder the columns chronologically so
# legend entries match the curves.
df_bikes_month = df_bikes_month[calendar.month_abbr[1:]]
fig, axes = plt.subplots(1, 1, figsize=(7, 7), sharex=True)
df_bikes_month.plot(ax=axes)
# BUG FIX: labels/title were copy-pasted from the pollution plots; this
# figure shows accident counts, not concentrations.
axes.set_ylabel("Accidents")
axes.set_xlabel("Heure de la journée")
axes.set_title("Profil journalier des accidents par mois")
axes.set_xticks(np.arange(0, 24))
axes.set_xticklabels(np.arange(0, 24), rotation=45)
axes.legend(labels=calendar.month_name[1:], loc='lower left',
            bbox_to_anchor=(1, 0.1))
plt.tight_layout()
EXERCISE : Accidents by department
Perform an analysis to check when the accidents are occurring for each department.
# import pygal
# First install if needed for maps:
#   pip install pygal
#   pip install pygal_maps_fr
# Département populations (2018):
# https://public.opendatasoft.com/explore/dataset/population-francaise-par-departement-2018/table/?disjunctive.departement&location=7,47.12995,3.41125&basemap=jawg.streets
path_target = "./dpt_population.csv"
url = "https://public.opendatasoft.com/explore/dataset/population-francaise-par-departement-2018/download/?format=csv&timezone=Europe/Berlin&lang=en&use_labels_for_header=true&csv_separator=%3B"
path, fname = os.path.split(path_target)
pooch.retrieve(url, path=path, fname=fname, known_hash=None)'/home/jsalmon/Documents/Mes_cours/Montpellier/HAX712X/Courses/Pandas/dpt_population.csv'
# Département areas:
# https://www.regions-et-departements.fr/departements-francais#departements_fichiers
path_target = "./dpt_area.csv"
url = "https://www.regions-et-departements.fr/fichiers/departements-francais.csv"
path, fname = os.path.split(path_target)
pooch.retrieve(url, path=path, fname=fname, known_hash=None)
df_dtp_pop = pd.read_csv("dpt_population.csv", sep=";", low_memory=False)
# Recode Corsica ('2A'/'2B' -> '20A'/'20B') so both tables sort identically.
df_dtp_pop['Code Département'].replace('2A', '20A', inplace=True)
df_dtp_pop['Code Département'].replace('2B', '20B', inplace=True)
df_dtp_pop.sort_values(by=['Code Département'], inplace=True)
df_bikes['departement'].replace('2A', '20A', inplace=True)
df_bikes['departement'].replace('2B', '20B', inplace=True)
df_bikes.sort_values(by=['departement'], inplace=True)
# Accident count per département.
gd = df_bikes.groupby(['departement'], as_index=True, sort=True).size()
data = {'code': gd.index,
        '# Accidents per million': gd.values}
df = pd.DataFrame(data)
# BUG FIX: the column is labelled "per million" but the original code
# multiplied by 10000. (i.e. per 10,000 inhabitants); use 1e6 to match.
# NOTE(review): the division is positional and assumes both tables list the
# same départements in the same order after sorting — TODO confirm.
df['# Accidents per million'] = (
    df['# Accidents per million'].values * 1e6 / df_dtp_pop['Population'].values
)
import plotly.express as px
import geopandas

# Département polygons (overseas territories drawn close to mainland France).
departement = geopandas.read_file('departement_avec_outremer_rapprochée.geojson')

# Apply the same temporary Corsica recoding used for sorting the accident
# table, sort, then undo it and strip leading zeros from single-digit codes
# so the geojson codes match df['code'].
departement['code'] = departement['code'].replace({'2A': '20A', '2B': '20B'})
departement.sort_values(by=['code'], inplace=True)

zero_padded = ['0' + str(i) for i in range(1, 10)]
plain = [str(i) for i in range(1, 10)]
dict_replace = dict(zip(zero_padded, plain))

departement['code'] = departement['code'].replace(dict_replace)
df['code'] = df['code'].replace(dict_replace)
departement['code'] = departement['code'].replace({'20A': '2A', '20B': '2B'})
df['code'] = df['code'].replace({'20A': '2A', '20B': '2B'})
departement.set_index('code', inplace=True)
# Choropleth of the per-département accident rate on a blank background;
# one polygon per 'code', colored by the normalized accident count.
fig = px.choropleth_mapbox(
df,
geojson=departement,
locations="code",
color="# Accidents per million",
range_color=(0, df['# Accidents per million'].max()),
color_continuous_scale="rdbu",
center={'lat': 47, 'lon': 2},
zoom=3.25,
mapbox_style="white-bg",
)
fig.show()Sources:
https://www.kaggle.com/code/scratchpad/notebook670ec82922/edit
https://towardsdatascience.com/choropleth-maps-in-practice-with-plotly-and-python-672a5eef3a19
https://data.europa.eu/data/datasets/5fb11a6b13b6030b71898d44?locale=fr